In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm

import ipyparallel as ipp

from time import time
from datetime import datetime

import motif as mf

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RepeatedKFold

from scipy.stats import spearmanr
from scipy.stats import pearsonr
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
In [3]:
### set parameters for the motif analysis

PROTEIN_NAME='T7 primase'
PROT_CONC=5 # free protein concentration at binding reation; PBM typically 0.1 and RNACompete typically 0.002
BOTH_STRANDS=False # wheter both strands are present for binding; True if double-stranded DNA or RNA is used as probes

STAGES=mf.stage(protein=PROTEIN_NAME)
In [4]:
### read data

dfprobes_raw=pd.read_csv('./data/T7/chip_B_favor.csv', sep=';')
print('Columns of imported Data File: %s'%dfprobes_raw.columns)
Columns of imported Data File: Index(['0_nuc', '1_nuc', '2_nuc', '3_nuc', '4_nuc', '5_nuc', '6_nuc', '7_nuc',
       '8_nuc', '9_nuc', '10_nuc', '11_nuc', '12_nuc', '13_nuc', '14_nuc',
       '15_nuc', '16_nuc', '17_nuc', '18_nuc', '19_nuc', '20_nuc', '21_nuc',
       '22_nuc', '23_nuc', '24_nuc', '25_nuc', '26_nuc', '27_nuc', '28_nuc',
       '29_nuc', '30_nuc', '31_nuc', '32_nuc', '33_nuc', '34_nuc', '35_nuc',
       'prim_only_score', 'prim', 'poly', 'seq'],
      dtype='object')
In [5]:
### select columns for probe sequence and signal

column_sequence='seq'
column_signal='prim_only_score'
#background_signal='mean_background_intensity'
background_signal=None                             #set to None if not needed

#basic preprocessing
dfprobes_raw[column_signal]=dfprobes_raw[column_signal].apply(lambda a: np.NaN if a==' ' else a)
dfprobes_raw[column_sequence]=dfprobes_raw[column_sequence].apply(lambda a: np.NaN if a=='nan' else a)
dfprobes_raw=dfprobes_raw.dropna()

#construct new dataframe with only necessary data
if type(background_signal)==type(None):
    dfprobes=pd.DataFrame({'seq':dfprobes_raw[column_sequence].astype(str),
                           'signal binding': dfprobes_raw[column_signal].astype(np.float32)}) #rebuild dataframe
else:
    dfprobes=pd.DataFrame({'seq':dfprobes_raw[column_sequence].astype(str),
                           'signal': dfprobes_raw[column_signal].astype(np.float32), 
                           'background':dfprobes_raw[background_signal].astype(np.float32)}) #rebuild dataframe
    dfprobes['signal binding']=dfprobes['signal']-dfprobes['background']
# display main properties of data set

dfprobes['signal binding']=dfprobes['signal binding']**2  # as raw data was squared

dfprobes['signal binding'].plot(figsize=(15,5))
dfprobes.describe()

### check type of nucleic acid

dfprobes['seq']=dfprobes['seq'].apply(lambda seq: seq.upper().replace(" ", ""))  #upper and remove blanks
dfprobes['RNA']=dfprobes['seq'].apply(lambda seq: all(char in 'ACGU' for char in seq))
dfprobes['DNA']=dfprobes['seq'].apply(lambda seq: all(char in 'ACGT' for char in seq))
non_RNA_counts=len(dfprobes[dfprobes['RNA']==False])
non_DNA_counts=len(dfprobes[dfprobes['DNA']==False])

if non_RNA_counts<non_DNA_counts:
    NUC_TYPE='RNA'; print('I: RNA probes detected!')
else: NUC_TYPE='DNA'; print('I: DNA probes detected!')
    
if NUC_TYPE=='RNA' and non_RNA_counts!=0:
    print('E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.')
    print('E: Please check the following sequnces %s'%dfprobes[dfprobes['RNA']==False])

if NUC_TYPE=='DNA' and non_DNA_counts!=0:
    print('E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.')
    print('E: Please check the following sequnces %s'%dfprobes[dfprobes['DNA']==False])
I: DNA probes detected!
In [6]:
### option to add a constant sequence at the 3' end and 5' end
sequence_to_be_added_5='' 
sequence_to_be_added_3='GTCTTG'        # standard PBM arrays: CCTGTGTGAAATTGTTATCCGCTCT T7 array: GTCTTGA..
dfprobes['seq']=sequence_to_be_added_5.upper()+dfprobes['seq']+sequence_to_be_added_3.upper()
print(f"I: The nucleotide sequences {sequence_to_be_added_5.upper()} has been added to the 5' end all probe sequences.")
print(f"I: The nucleotide sequences {sequence_to_be_added_3.upper()} has been added to the 3' end all probe sequences.")
I: The nucleotide sequences  has been added to the 5' end all probe sequences.
I: The nucleotide sequences GTCTTG has been added to the 3' end all probe sequences.
In [7]:
### egalize length
dfprobes['seq_length']=dfprobes['seq'].apply(len)    

if max(dfprobes['seq_length'])!=min(dfprobes['seq_length']):
    print('I: Probes length is not uniform, detected range: %i ..%i' %(min(dfprobes['seq_length']),max(dfprobes['seq_length'])))
    max_length=max(dfprobes['seq_length'])
    dfprobes['padded_sequence']=dfprobes['seq'].apply(lambda seq: seq+((max_length-len(seq))*'-'))
    print("I: Probe sequences have been padded at the 5' to the uniform length of %i nucleotides" %max_length)
else:
    print('I: Probe sequences have the uniform length of %i nucleotides' %(dfprobes['seq_length'].median()))
    dfprobes['padded_sequence']=dfprobes['seq']

print('I: Total datasets contains %i sequences.' % len(dfprobes))

# visualize composition of each position
df_nucleotides=mf.split_sequence_in_nucleotides(dfprobes['padded_sequence'])
dfcount=pd.DataFrame(index=['A', 'C', 'G', 'T', 'U', '-'])
for column in df_nucleotides:
    dfcount[column]=df_nucleotides[column].value_counts() 
dfcount=dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15,5), kind='bar')
print('I: Visualisation of the base composition per position')
print('I: If positions are invariant they can be removed before sequence analysis.')
I: Probe sequences have the uniform length of 42 nucleotides
I: Total datasets contains 3149 sequences.
I: Visualisation of the base composition per position
I: If positions are invariant they can be removed before sequence analysis.
In [8]:
# You may remove invariant continuos positions by adjusting the slicing. 
# It is recommended to leave a few invariant positions to allow for binding events
# between the variable and constant part of the probes.

dfprobes['padded_sequence']=dfprobes['padded_sequence'].apply(lambda s:s[:43]) ### <==== do the slicing here

# visualize composition of each position
print('I: Visualisation of the base composition per position after slicing.')
df_nucleotides=mf.split_sequence_in_nucleotides(dfprobes['padded_sequence'])
dfcount=pd.DataFrame(index=['A', 'C', 'G', 'T', 'U', '-'])
for column in df_nucleotides:
    dfcount[column]=df_nucleotides[column].value_counts() 
dfcount=dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15,5), kind='bar')
plt.show()

### Preparation for later classification
mean=dfprobes['signal binding'].mean()
std=dfprobes['signal binding'].std()
THRESHOLD=mean+2*std   
dfprobes['positive probe']=dfprobes['signal binding'].apply(lambda s: True if s>THRESHOLD else False)

print('I: The whole dataset has been used to set the threshold for a positive probe.')
print('T: The threshold is %f' %THRESHOLD)
print(f"I: {len(dfprobes[dfprobes['positive probe']])} probes of {len(dfprobes)} are above threshold.")
I: Visualisation of the base composition per position after slicing.
I: The whole dataset has been used to set the threshold for a positive probe.
T: The threshold is 0.663948
I: 196 probes of 3149 are above threshold.
In [9]:
### Shuffle and prepare dataset for training and testing

# shuffle 
dfprobes = shuffle(dfprobes)

# display histogramms of dataset
dfprobes['signal binding'].plot(kind='hist', bins=25).axvline(x=THRESHOLD, color='r', linestyle='-.', lw=0.5, label='threshold classification')

# complete data
X=mf.hotencode_sequence(dfprobes['padded_sequence'], nuc_type=NUC_TYPE)
y=np.array(dfprobes['signal binding'])
In [10]:
#### Perfrom GridCV Search for exploration of the motif length and other model parameters 
# primary goal: identify the minimum motif length which gives a good r-value
# optional: allow also for global optimization to verify whether the local optimization is good enough
# optional: vary G0 to find a good regression model
# optional: fitG0=True. This option adjust G0, so that maximal binding is around 1 (prevents saturation of probes by multiple binding events)


model_grid=mf.findmotif(protein_conc=PROT_CONC, both_strands=BOTH_STRANDS, fit_G0=True)

# prepare grid search over parameters
param_grid = {"motif_length": [1,2,3,4,5,6]}     # choose sensible range for length of motif and other parameters

# run grid search
grid_search = GridSearchCV(model_grid, param_grid=param_grid, verbose=2, 
                           cv=RepeatedKFold(n_splits=5, n_repeats=10),
                           n_jobs=-1,
                           return_train_score=True)

start = time()
grid_search.fit(X, y)
df_grid=pd.DataFrame(grid_search.cv_results_)

print("I: GridSearchCV took %.2f hours for %d candidate parameter settings."
    % ((time() - start)/3600, len(grid_search.cv_results_["params"])))
print('I: number of samples: %i' %len(X))

df_grid=pd.DataFrame(grid_search.cv_results_)

print('I: Plot of r vs motif length')
df_grid['r(train)']=df_grid['mean_train_score'].apply(np.sqrt)
df_grid['r(test)']=df_grid['mean_test_score'].apply(np.sqrt)
df_grid['std(train)']=df_grid['std_train_score'].apply(np.sqrt)
df_grid['std(test)']=df_grid['std_test_score'].apply(np.sqrt)


fig, ax=plt.subplots()
df_grid.plot(ax=ax,kind='scatter', x='param_motif_length', y='r(train)', yerr='std(train)', figsize=(5,3), c='blue').set_xticks(param_grid["motif_length"])
df_grid.plot(ax=ax,kind='scatter', x='param_motif_length', y='r(test)', yerr='std(test)', c='red')
ax.set_ylabel('r')
ax.legend(['train', 'test'])
plt.show()
df_grid[['param_motif_length','r(test)', 'r(train)']]
Fitting 50 folds for each of 6 candidates, totalling 300 fits
I: GridSearchCV took 14.19 hours for 6 candidate parameter settings.
I: number of samples: 3149
I: Plot of r vs motif length
Out[10]:
param_motif_length r(test) r(train)
0 1 0.173419 0.187203
1 2 0.658905 0.660726
2 3 0.666561 0.670032
3 4 0.682361 0.689046
4 5 0.691722 0.697365
5 6 0.694235 0.701794
In [11]:
### run a number of identical optimizations with motif length found during grid search
### goal: find best motif through repetition, judge stabiltiy of optimization

CORE_MOTIF_LENGTH=4  # adjust core motif length if needed, motif length can be changed later

# prepare for ipyparallel
number_of_optimizations = 200
model_list = [mf.findmotif(motif_length=CORE_MOTIF_LENGTH, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS, fit_G0=True)] * number_of_optimizations
X_list = [X] * number_of_optimizations
y_list = [y] * number_of_optimizations


def single_job(model, X, y):
    model.fit(X, y)
    return {'model':model}

# run the optimizations on ipp.cluster
start = time()
with ipp.Cluster(log_level=40) as rc:
    rc[:].use_pickle()
    view = rc.load_balanced_view()
    asyncresult = view.map_async(single_job, model_list, X_list, y_list)
    asyncresult.wait_interactive()
    result = asyncresult.get()
print("I: Optimization took %.2f hours." % ((time() - start) / 3600))


  
# assemble results and analyze
df_repetitions=pd.DataFrame(result)
df_repetitions['r']=df_repetitions['model'].apply(lambda e: mf.linregress(e.predict(X),y).rvalue)
df_repetitions['G0']=df_repetitions['model'].apply(lambda e: e.finalG0_)
df_repetitions['max binding']=df_repetitions['model'].apply(lambda e: e.max_binding_fit)
df_repetitions['min binding']=df_repetitions['model'].apply(lambda e: e.min_binding_fit)
df_repetitions['ratio'] = df_repetitions['model'].apply(lambda e: e.ratio)
df_repetitions['energies']=df_repetitions['model'].apply(lambda e: e.energies_)
df_repetitions['information']=df_repetitions['model'].apply(lambda e: mf.energies2information(e.energies_))


# display results of the ensemble of optimizations
print('I: Results of the repeated motif finding, sorted according to the regression coefficient')
df_repetitions.sort_values('r', ascending=False, inplace=True)
mf.display_df(df_repetitions, nuc_type=NUC_TYPE)
  0%|          | 0/16 [00:00<?, ?engine/s]
single_job:   0%|          | 0/200 [00:00<?, ?tasks/s]
I: Optimization took 10.22 hours.
I: Results of the repeated motif finding, sorted according to the regression coefficient
model r G0 max binding min binding ratio energies information logo
184 suppressed 0.709102 -8369.682846 0.961111 3.057673e-01 3.143276e+00 -6305,.. 2.372494
60 suppressed 0.709079 -6483.433347 0.969193 3.052673e-01 3.174901e+00 -6098,.. 2.434594
68 suppressed 0.708971 -9111.459686 0.970340 3.117814e-01 3.112246e+00 -5960,.. 2.360772
63 suppressed 0.708837 -8917.596818 0.968301 3.115061e-01 3.108449e+00 -4911,.. 2.374628
144 suppressed 0.708727 -9858.288782 0.975751 3.108550e-01 3.138928e+00 -6247,.. 2.281839
114 suppressed 0.708587 -10537.550819 0.977608 3.110741e-01 3.142686e+00 -5771,.. 2.270755
108 suppressed 0.708546 -11716.668946 0.982724 3.243494e-01 3.029831e+00 -5044,.. 2.231729
107 suppressed 0.708505 -10799.940918 0.983687 3.207868e-01 3.066482e+00 -4894,.. 2.240343
98 suppressed 0.708344 -9607.492522 0.961975 3.210375e-01 2.996457e+00 -6141,.. 2.295675
43 suppressed 0.708297 -12027.574721 0.980347 3.232646e-01 3.032646e+00 -4163,.. 2.262535
69 suppressed 0.708239 -12542.382842 0.976880 3.270506e-01 2.986939e+00 -4521,.. 2.152745
75 suppressed 0.708180 -10861.826932 0.971977 3.216252e-01 3.022080e+00 -6545,.. 2.108372
171 suppressed 0.708176 -12574.545868 0.961221 3.168588e-01 3.033594e+00 -4297,.. 2.181432
130 suppressed 0.708125 -10091.505729 0.960229 3.288380e-01 2.920067e+00 -6843,.. 2.222277
96 suppressed 0.708117 -12468.372975 0.978621 3.347994e-01 2.923006e+00 -4713,.. 2.165810
32 suppressed 0.708044 -10567.029564 0.990502 3.309813e-01 2.992622e+00 -6730,.. 2.128858
94 suppressed 0.708014 -6377.124055 0.975092 3.002550e-01 3.247548e+00 -7061,.. 2.493042
14 suppressed 0.707989 -12696.448826 0.978061 3.171902e-01 3.083517e+00 -3945,.. 2.186752
44 suppressed 0.707945 -10909.425345 0.978234 3.277142e-01 2.985023e+00 -6109,.. 2.180312
49 suppressed 0.707899 -11768.069798 0.976074 3.122815e-01 3.125624e+00 -5124,.. 2.075452
28 suppressed 0.707751 -10773.722431 0.989636 3.263602e-01 3.032342e+00 -6965,.. 2.027353
142 suppressed 0.707746 -11610.950337 0.966895 3.193371e-01 3.027819e+00 -5455,.. 2.148604
21 suppressed 0.707740 -12036.418943 0.984355 3.345586e-01 2.942251e+00 -5711,.. 2.061603
8 suppressed 0.707738 -13148.663195 0.978205 3.146517e-01 3.108849e+00 -3911,.. 2.085305
195 suppressed 0.707713 -9692.825219 0.978272 3.199624e-01 3.057459e+00 -7421,.. 2.110664
149 suppressed 0.707620 -13161.544905 0.973257 3.387901e-01 2.872742e+00 -4232,.. 2.094864
87 suppressed 0.707618 -13553.509385 0.977256 3.327196e-01 2.937175e+00 -3862,.. 2.075937
104 suppressed 0.707603 -12612.377303 0.975036 3.181669e-01 3.064541e+00 -4193,.. 2.104714
152 suppressed 0.707585 -11501.727201 0.973607 3.443238e-01 2.827590e+00 -5912,.. 2.155174
141 suppressed 0.707575 -9973.717176 0.985053 3.404881e-01 2.893062e+00 -7111,.. 2.125851
16 suppressed 0.707554 -13159.914853 0.978340 3.379948e-01 2.894541e+00 -4103,.. 2.122826
146 suppressed 0.707519 -11236.377197 0.979721 3.290213e-01 2.977684e+00 -6075,.. 2.101092
65 suppressed 0.707508 -13867.033125 0.978684 3.369704e-01 2.904360e+00 -4009,.. 1.985818
151 suppressed 0.707465 -12529.011975 0.983077 3.272703e-01 3.003867e+00 -3756,.. 2.150554
13 suppressed 0.707361 -12294.144056 0.969659 3.328952e-01 2.912805e+00 -5368,.. 2.036012
120 suppressed 0.707344 -14248.122927 0.974966 3.397490e-01 2.869665e+00 -3806,.. 1.933827
183 suppressed 0.707320 -13547.375868 0.991946 3.190458e-01 3.109102e+00 -4466,.. 1.959040
192 suppressed 0.707311 -14071.357127 0.987226 3.426005e-01 2.881567e+00 -4029,.. 1.942493
172 suppressed 0.707291 -14202.033901 0.985084 3.274552e-01 3.008301e+00 -3611,.. 1.929793
109 suppressed 0.707267 -13278.828687 0.990597 3.262230e-01 3.036563e+00 -4060,.. 2.121135
29 suppressed 0.707244 -11582.318621 0.974590 3.222792e-01 3.024056e+00 -5820,.. 2.091526
70 suppressed 0.707232 -14426.213016 0.977949 3.314377e-01 2.950627e+00 -3647,.. 1.908878
24 suppressed 0.707171 -13452.424348 0.969906 3.256196e-01 2.978647e+00 -4469,.. 1.867069
31 suppressed 0.707094 -14150.017156 0.972750 3.441585e-01 2.826460e+00 -3919,.. 1.960341
188 suppressed 0.707090 -13118.532865 0.984173 3.203848e-01 3.071847e+00 -3875,.. 2.115472
62 suppressed 0.707049 -14663.897962 0.976094 3.358426e-01 2.906403e+00 -3559,.. 1.857300
34 suppressed 0.707022 -14141.075086 0.973570 3.415120e-01 2.850763e+00 -4104,.. 1.887372
161 suppressed 0.706965 -14284.886682 0.974979 3.323667e-01 2.933444e+00 -3876,.. 1.898711
78 suppressed 0.706961 -13842.987845 0.981583 3.305896e-01 2.969190e+00 -4185,.. 1.875746
41 suppressed 0.706938 -14000.601372 0.985296 3.476028e-01 2.834546e+00 -4135,.. 1.951028
89 suppressed 0.706883 -14046.231784 0.970092 3.480889e-01 2.786907e+00 -3878,.. 1.977826
101 suppressed 0.706842 -14628.003671 0.974030 3.352377e-01 2.905489e+00 -3577,.. 1.819633
30 suppressed 0.706813 -14424.234893 0.970130 3.446618e-01 2.814729e+00 -3763,.. 1.887071
119 suppressed 0.706809 -14484.700741 0.978064 3.240691e-01 3.018073e+00 -3443,.. 1.896427
113 suppressed 0.706800 -10651.474175 1.001419 3.185569e-01 3.143612e+00 -6723,.. 1.865427
80 suppressed 0.706772 -14603.283640 0.977040 3.356094e-01 2.911243e+00 -3647,.. 1.819496
166 suppressed 0.706756 -13080.277537 0.967184 3.261860e-01 2.965130e+00 -3613,.. 1.940005
158 suppressed 0.706714 -13754.512853 0.973007 3.125949e-01 3.112678e+00 -4105,.. 1.823113
150 suppressed 0.706676 -13905.816204 0.980039 3.233605e-01 3.030794e+00 -3340,.. 2.017035
17 suppressed 0.706649 -14516.754898 0.992478 3.300992e-01 3.006606e+00 -3786,.. 1.775090
50 suppressed 0.706649 -12385.739588 1.000124 3.285459e-01 3.044093e+00 -5813,.. 1.821217
155 suppressed 0.706577 -14871.704602 0.971259 3.423087e-01 2.837377e+00 -3427,.. 1.836385
39 suppressed 0.706560 -14834.369569 0.984353 3.466659e-01 2.839485e+00 -3633,.. 1.828666
57 suppressed 0.706538 -14560.121811 0.977939 3.508539e-01 2.787310e+00 -3520,.. 1.913195
10 suppressed 0.706332 -14348.438630 0.976600 3.543698e-01 2.755879e+00 -3950,.. 1.891869
190 suppressed 0.706238 -15146.146701 0.979361 3.513099e-01 2.787740e+00 -3404,.. 1.779870
59 suppressed 0.706191 -14736.686310 0.973408 3.386674e-01 2.874229e+00 -3279,.. 1.843195
189 suppressed 0.706132 -14266.456449 0.992509 3.298950e-01 3.008561e+00 -3416,.. 1.956562
132 suppressed 0.706020 -15124.490816 0.991018 3.284176e-01 3.017553e+00 -3341,.. 1.679107
170 suppressed 0.705971 -15037.345084 0.980479 3.519534e-01 2.785820e+00 -3263,.. 1.822034
88 suppressed 0.705764 -15043.458779 0.982262 3.280813e-01 2.993960e+00 -3285,.. 1.662796
117 suppressed 0.705724 -15392.428503 0.985658 3.399299e-01 2.899592e+00 -3077,.. 1.660002
197 suppressed 0.705694 -15355.968739 0.995124 3.378709e-01 2.945280e+00 -3235,.. 1.636952
178 suppressed 0.705510 -11919.848967 0.974559 3.512351e-01 2.774662e+00 -6017,.. 2.081734
176 suppressed 0.704807 -12282.556756 0.987461 3.497943e-01 2.822975e+00 -6343,.. 1.853654
40 suppressed 0.704783 -12441.101308 0.985508 3.538028e-01 2.785473e+00 -5786,.. 1.915172
86 suppressed 0.704288 -16117.681316 0.988564 3.593306e-01 2.751127e+00 -2946,.. 1.499881
66 suppressed 0.703571 -15843.789595 0.986702 3.731855e-01 2.643999e+00 -2915,.. 1.486631
174 suppressed 0.703511 -16671.185196 0.990813 3.838566e-01 2.581205e+00 -2717,.. 1.411135
25 suppressed 0.701207 -17758.332414 0.995580 4.279674e-01 2.326299e+00 -2219,.. 1.100871
79 suppressed 0.701059 -5687.560070 0.977325 3.455221e-01 2.828547e+00 -9669,.. 2.695829
179 suppressed 0.693248 -17615.624014 0.993124 4.555975e-01 2.179827e+00 -2091,.. 1.166737
186 suppressed 0.691515 -16146.337987 1.006219 2.862331e-01 3.515384e+00 952,.. 1.437103
168 suppressed 0.691327 -15643.989028 0.996489 2.532285e-01 3.935136e+00 1011,.. 1.550288
164 suppressed 0.691300 -13069.601688 0.999684 2.780287e-01 3.595614e+00 1013,.. 1.478663
156 suppressed 0.691226 -16913.401053 1.003214 3.380721e-01 2.967457e+00 997,.. 1.218829
199 suppressed 0.690879 -15436.036410 0.993807 2.997857e-01 3.315060e+00 1128,.. 1.381782
26 suppressed 0.690621 -13797.222377 0.995097 3.711335e-01 2.681237e+00 994,.. 1.259845
181 suppressed 0.690596 -15920.557199 0.998980 4.038654e-01 2.473546e+00 1023,.. 1.132092
81 suppressed 0.690278 -17542.369990 1.007353 3.258259e-01 3.091692e+00 2059,.. 1.091616
162 suppressed 0.690084 -12729.382776 0.991340 1.950888e-01 5.081478e+00 910,.. 1.845683
77 suppressed 0.689858 -15244.207265 0.997759 2.320090e-01 4.300517e+00 3003,.. 1.252288
93 suppressed 0.689778 -15784.842901 1.002332 1.925392e-01 5.205860e+00 3793,.. 1.347076
177 suppressed 0.689680 -14850.846865 0.993328 1.568490e-01 6.333020e+00 4782,.. 1.434956
198 suppressed 0.689632 -12986.871927 1.008315 1.267022e-01 7.958144e+00 6869,.. 1.571041
194 suppressed 0.689611 -15602.098567 1.003485 1.592229e-01 6.302395e+00 4810,.. 1.428945
33 suppressed 0.689609 -12278.138207 0.993908 2.039798e-01 4.872583e+00 823,.. 1.806295
105 suppressed 0.689552 -13609.467077 0.992334 1.916921e-01 5.176705e+00 3309,.. 1.355005
58 suppressed 0.689512 -17358.138957 1.005609 4.189782e-01 2.400146e+00 870,.. 1.045546
110 suppressed 0.689484 -13592.997812 1.004792 1.353846e-01 7.421760e+00 5089,.. 1.514374
91 suppressed 0.689472 -15107.581727 1.004360 2.613080e-01 3.843587e+00 1780,.. 1.380881
42 suppressed 0.689452 -14880.210463 1.006580 1.454196e-01 6.921896e+00 5140,.. 1.462380
175 suppressed 0.689431 -15614.763683 1.002749 1.481553e-01 6.768228e+00 5129,.. 1.454054
131 suppressed 0.689420 -14486.356792 1.004350 1.332029e-01 7.540000e+00 9128,.. 1.517692
37 suppressed 0.689415 -10870.138328 0.983454 1.084727e-01 9.066369e+00 7527,.. 1.617936
138 suppressed 0.689389 -15210.219158 1.001756 1.411859e-01 7.095299e+00 7125,.. 1.479178
163 suppressed 0.689322 -17016.173404 1.001110 2.271160e-01 4.407925e+00 3805,.. 1.228709
23 suppressed 0.689285 -15334.121138 0.998403 1.601254e-01 6.235128e+00 6096,.. 1.426229
139 suppressed 0.689275 -14968.738712 0.985607 1.611523e-01 6.115998e+00 5297,.. 1.462571
136 suppressed 0.689245 -15624.305603 0.995569 1.637573e-01 6.079542e+00 3770,.. 1.443532
47 suppressed 0.689201 -13897.680234 1.000110 1.110334e-01 9.007290e+00 6088,.. 1.572170
84 suppressed 0.689147 -13428.710600 0.999244 1.644637e-01 6.075770e+00 4204,.. 1.417729
7 suppressed 0.689108 -16304.036773 1.016735 2.024288e-01 5.022680e+00 5265,.. 1.311394
15 suppressed 0.688938 -16839.026618 0.990299 2.106023e-01 4.702221e+00 3149,.. 1.294219
134 suppressed 0.688918 -16293.358489 1.007546 1.944064e-01 5.182676e+00 6301,.. 1.329639
97 suppressed 0.688906 -17070.717136 0.994390 2.213411e-01 4.492569e+00 4240,.. 1.222828
72 suppressed 0.688881 -16756.222537 0.995600 2.521916e-01 3.947792e+00 3284,.. 1.182881
143 suppressed 0.688853 -17184.192814 1.003904 3.492510e-01 2.874449e+00 724,.. 1.212848
9 suppressed 0.688788 -16929.817999 0.999057 2.282671e-01 4.376702e+00 4598,.. 1.233442
103 suppressed 0.688759 -15021.762842 1.032351 1.863520e-01 5.539792e+00 7979,.. 1.414699
95 suppressed 0.688724 -15716.280579 1.005432 1.903584e-01 5.281787e+00 7310,.. 1.375747
123 suppressed 0.688712 -16358.697414 0.999613 4.169621e-01 2.397371e+00 1103,.. 1.001356
133 suppressed 0.688627 -16714.913398 1.000219 2.615284e-01 3.824512e+00 3078,.. 1.170829
83 suppressed 0.688541 -13193.813234 1.003698 9.415072e-02 1.066054e+01 7463,.. 1.722177
140 suppressed 0.688419 -16201.718296 0.996574 2.026823e-01 4.916926e+00 7043,.. 1.312820
193 suppressed 0.688277 -12711.452094 1.036695 9.553550e-02 1.085141e+01 8309,.. 1.678374
4 suppressed 0.688199 -14171.902627 1.007360 1.889832e-01 5.330423e+00 4795,.. 1.387307
145 suppressed 0.687912 -16847.358892 1.002725 1.902105e-01 5.271658e+00 4759,.. 1.292409
191 suppressed 0.687045 -14053.780496 1.023599 1.200040e-01 8.529711e+00 3649,.. 1.613660
182 suppressed 0.687007 -14575.935787 0.990127 1.109742e-01 8.922138e+00 3799,.. 1.639604
118 suppressed 0.686961 -14280.962116 0.989060 1.593389e-01 6.207275e+00 5230,.. 1.522630
0 suppressed 0.686316 -18402.724732 1.006706 4.111071e-01 2.448769e+00 468,.. 0.951009
165 suppressed 0.686287 -18092.935773 1.011781 5.367038e-01 1.885176e+00 552,.. 0.863286
38 suppressed 0.686258 -15721.053149 0.997985 4.249321e-01 2.348576e+00 59,.. 1.047510
137 suppressed 0.685782 -16064.398904 0.994807 2.989403e-01 3.327778e+00 137,.. 1.353409
74 suppressed 0.685781 -13890.959743 1.001878 2.735926e-01 3.661935e+00 6,.. 1.409137
99 suppressed 0.685694 -18151.931068 0.991348 4.382392e-01 2.262116e+00 165,.. 0.962420
19 suppressed 0.685242 -11562.276563 1.000928 8.590960e-02 1.165095e+01 3758,.. 1.756323
173 suppressed 0.685103 -14084.217988 0.997685 2.207472e-01 4.519580e+00 -78,.. 1.579495
111 suppressed 0.684769 -15319.082471 1.008228 1.722941e-01 5.851782e+00 5109,.. 1.495716
3 suppressed 0.684431 -15401.274495 1.007629 1.911576e-01 5.271197e+00 677,.. 1.498431
121 suppressed 0.684407 -13937.650487 0.992099 2.063439e-01 4.807990e+00 729,.. 1.446902
73 suppressed 0.684283 -12813.992369 0.985977 2.417026e-01 4.079299e+00 1026,.. 1.320159
128 suppressed 0.684202 -15141.216197 1.003128 2.139854e-01 4.687836e+00 -241,.. 1.564046
106 suppressed 0.684201 -11086.710726 0.997053 1.895628e-01 5.259753e+00 -153,.. 1.716338
154 suppressed 0.684183 -13200.388406 1.002949 1.029199e-01 9.744952e+00 697,.. 1.837202
45 suppressed 0.684162 -15173.224665 0.997116 1.603302e-01 6.219140e+00 904,.. 1.560863
129 suppressed 0.684059 -15488.045282 1.009765 2.650104e-01 3.810284e+00 611,.. 1.314235
148 suppressed 0.683925 -17228.786501 0.999101 2.611584e-01 3.825651e+00 478,.. 1.230638
48 suppressed 0.683843 -12645.591557 0.995942 1.936052e-01 5.144191e+00 -76,.. 1.674068
18 suppressed 0.683652 -14474.068822 0.996068 1.545802e-01 6.443700e+00 3415,.. 1.563079
6 suppressed 0.683599 -14688.853454 0.996305 1.613292e-01 6.175602e+00 4630,.. 1.549996
160 suppressed 0.682889 -17833.357924 1.004205 3.432141e-01 2.925884e+00 683,.. 1.044587
12 suppressed 0.682880 -14569.677210 1.002315 1.460788e-01 6.861466e+00 293,.. 1.648877
46 suppressed 0.682860 -14360.525184 0.992889 3.322933e-01 2.987989e+00 1016,.. 1.098568
127 suppressed 0.682661 -15937.728140 1.000485 1.556259e-01 6.428782e+00 3187,.. 1.533165
51 suppressed 0.682285 -13358.780025 1.002176 8.859939e-02 1.131132e+01 3474,.. 1.810449
53 suppressed 0.682245 -16159.402162 1.002702 2.726855e-01 3.677137e+00 -85,.. 1.328096
157 suppressed 0.682244 -16727.249768 0.997462 2.422436e-01 4.117597e+00 967,.. 1.239346
147 suppressed 0.682243 -10522.042460 0.986185 1.273125e-02 7.746174e+01 9101,.. 2.018860
85 suppressed 0.681994 -13788.694471 0.987880 7.037748e-02 1.403688e+01 3900,.. 1.875761
52 suppressed 0.681892 -13230.769258 1.005784 6.794992e-02 1.480185e+01 3584,.. 1.869381
61 suppressed 0.681854 -12949.802093 1.078937 3.664788e-01 2.944063e+00 1165,.. 1.333286
196 suppressed 0.681745 -11542.925092 0.991728 3.265836e-02 3.036676e+01 5931,.. 2.082934
82 suppressed 0.681706 -16721.547422 1.008883 2.154218e-01 4.683292e+00 2154,.. 1.336659
125 suppressed 0.681639 -1996.077350 1.003199 1.459453e-02 6.873802e+01 16535,.. 2.232660
22 suppressed 0.681514 -17385.697656 0.999252 4.608787e-01 2.168145e+00 188,.. 0.989101
35 suppressed 0.681338 -11532.083488 1.009510 5.626905e-02 1.794078e+01 2213,.. 1.937487
56 suppressed 0.681278 -11249.272314 0.999153 7.780888e-02 1.284111e+01 1833,.. 1.817969
159 suppressed 0.681234 -10414.610430 1.008112 6.946986e-02 1.451149e+01 1106,.. 1.855901
55 suppressed 0.681200 -15262.818769 0.992735 1.845043e-01 5.380555e+00 2006,.. 1.428374
135 suppressed 0.680914 -17342.398900 1.000921 3.200151e-01 3.127732e+00 2799,.. 1.143231
2 suppressed 0.680594 -17428.994679 1.005258 5.347457e-01 1.879881e+00 115,.. 0.826383
116 suppressed 0.680548 -12977.217263 1.009804 1.789104e-01 5.644190e+00 1113,.. 1.403936
54 suppressed 0.680547 -14423.721571 1.000461 1.015597e-01 9.850965e+00 1511,.. 1.701792
187 suppressed 0.680056 -17443.841353 0.995991 3.956357e-01 2.517445e+00 -1262,.. 1.141352
112 suppressed 0.679939 -12477.176909 0.992692 2.108629e-01 4.707758e+00 993,.. 1.459250
76 suppressed 0.679898 -10564.005567 0.978742 4.227548e-02 2.315153e+01 2989,.. 2.119125
5 suppressed 0.678769 -14572.544316 0.998531 9.389081e-02 1.063503e+01 1286,.. 1.752676
100 suppressed 0.678695 -15289.721683 1.004637 1.653401e-01 6.076186e+00 480,.. 1.711596
167 suppressed 0.678046 -19724.382139 1.015811 5.726492e-01 1.773880e+00 1528,.. 0.556397
124 suppressed 0.677731 -12629.052363 1.004765 8.832236e-02 1.137611e+01 1338,.. 1.644186
67 suppressed 0.677316 -14177.430824 1.007285 1.029000e-01 9.788969e+00 1343,.. 1.608008
180 suppressed 0.676378 -5191.839972 0.999909 1.740217e-01 5.745887e+00 -4690,.. 2.882839
126 suppressed 0.675290 -9392.032255 1.001299 2.211596e-01 4.527493e+00 270,.. 1.803445
36 suppressed 0.674456 -4005.721485 1.024127 2.199928e-02 4.655276e+01 -501,.. 2.249018
90 suppressed 0.674130 -1477.444289 1.007652 1.485172e-01 6.784752e+00 161,.. 2.049748
27 suppressed 0.672611 -10589.702972 0.995451 4.642809e-02 2.144070e+01 577,.. 2.114644
20 suppressed 0.671560 -11984.804438 1.011126 3.853435e-02 2.623961e+01 225,.. 2.133795
71 suppressed 0.671365 -10616.850003 1.002407 3.113391e-02 3.219662e+01 689,.. 2.153088
153 suppressed 0.664325 -16626.876826 0.989522 1.990431e-01 4.971393e+00 -1521,.. 1.287714
1 suppressed 0.657098 -12774.305450 1.012679 1.648924e-01 6.141451e+00 -5217,.. 1.893854
102 suppressed 0.656719 -12330.372926 1.003932 1.359755e-01 7.383182e+00 -5682,.. 2.019963
11 suppressed 0.633816 -16151.612530 1.283313 3.320448e-01 3.864880e+00 -903,.. 1.450578
115 suppressed 0.622930 -15283.432578 0.806148 2.157546e-01 3.736412e+00 -2852,.. 1.372022
185 suppressed 0.611097 -3939.913448 0.961641 3.961881e-02 2.427233e+01 7606,.. 2.687387
122 suppressed 0.544747 -1761.572649 1.010639 1.255845e-03 8.047480e+02 3010,.. 3.692221
64 suppressed 0.540184 20091.732530 1.003961 8.829830e-07 1.137010e+06 -5939,.. 4.375666
169 suppressed 0.537554 7974.476636 1.002667 1.379499e-02 7.268342e+01 -1750,.. 3.566194
92 suppressed 0.510130 10238.574405 0.996255 3.308198e-04 3.011475e+03 10172,.. 3.958166
In [12]:
df_repetitions=df_repetitions[:-int(len(df_repetitions)/20)] #delete 5% worst
In [13]:
### compare energy matrices of ensemble using PCA
print('I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.')
df_repetitions['r'].plot(kind='hist')
plt.show()

pca = PCA(n_components=2)
pca_2c=pca.fit_transform(df_repetitions['energies'].tolist())    
df_repetitions[['PCA1', 'PCA2']]=pca_2c
print('I: 2-dimensional PCA explained %i %% of variance.' %(sum(pca.explained_variance_ratio_)*100))
if sum(pca.explained_variance_ratio_)<0.5:
      print('W: 2-dimensional PCA explained only %i %% of variance' %(sum(pca.explained_variance_ratio_)*100))

print('I: Visualization of the PCA with the regression quality by color.')        
df_repetitions.plot(x='PCA1', y='PCA2', kind='scatter', c='r',cmap=cm.coolwarm, edgecolors='black', linewidths=0.3)
I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.
I: 2-dimensional PCA explained 80 % of variance.
I: Visualization of the PCA with the regression quality by color.
/home/GLipps/.local/lib/python3.8/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute `n_features_` was deprecated in version 1.2 and will be removed in 1.4. Use `n_features_in_` instead.
  warnings.warn(msg, category=FutureWarning)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb275d513d0>
In [14]:
mf.display_df(df_repetitions, nuc_type=NUC_TYPE)
model r G0 max binding min binding ratio energies information logo PCA1 PCA2
184 suppressed 0.709102 -8369.682846 0.961111 0.305767 3.143276 -6305,.. 2.372494 17988.736468 -39.460457
60 suppressed 0.709079 -6483.433347 0.969193 0.305267 3.174901 -6098,.. 2.434594 20391.888512 913.429057
68 suppressed 0.708971 -9111.459686 0.970340 0.311781 3.112246 -5960,.. 2.360772 16887.932155 -386.841108
63 suppressed 0.708837 -8917.596818 0.968301 0.311506 3.108449 -4911,.. 2.374628 16443.067048 616.642582
144 suppressed 0.708727 -9858.288782 0.975751 0.310855 3.138928 -6247,.. 2.281839 15918.681832 -617.144040
114 suppressed 0.708587 -10537.550819 0.977608 0.311074 3.142686 -5771,.. 2.270755 14895.509678 -651.561250
108 suppressed 0.708546 -11716.668946 0.982724 0.324349 3.029831 -5044,.. 2.231729 12947.405513 -655.573760
107 suppressed 0.708505 -10799.940918 0.983687 0.320787 3.066482 -4894,.. 2.240343 14256.905212 40.058050
98 suppressed 0.708344 -9607.492522 0.961975 0.321038 2.996457 -6141,.. 2.295675 16120.051120 -1053.813118
43 suppressed 0.708297 -12027.574721 0.980347 0.323265 3.032646 -4163,.. 2.262535 12051.655413 -149.126292
69 suppressed 0.708239 -12542.382842 0.976880 0.327051 2.986939 -4521,.. 2.152745 11728.799497 -696.268337
75 suppressed 0.708180 -10861.826932 0.971977 0.321625 3.022080 -6545,.. 2.108372 14972.576941 -1500.001547
171 suppressed 0.708176 -12574.545868 0.961221 0.316859 3.033594 -4297,.. 2.181432 11294.846696 -482.868040
130 suppressed 0.708125 -10091.505729 0.960229 0.328838 2.920067 -6843,.. 2.222277 15818.802865 -1867.920673
96 suppressed 0.708117 -12468.372975 0.978621 0.334799 2.923006 -4713,.. 2.165810 11785.107054 -946.389459
32 suppressed 0.708044 -10567.029564 0.990502 0.330981 2.992622 -6730,.. 2.128858 15582.704727 -1408.958697
94 suppressed 0.708014 -6377.124055 0.975092 0.300255 3.247548 -7061,.. 2.493042 17823.638354 896.963707
14 suppressed 0.707989 -12696.448826 0.978061 0.317190 3.083517 -3945,.. 2.186752 11140.409781 -188.148214
44 suppressed 0.707945 -10909.425345 0.978234 0.327714 2.985023 -6109,.. 2.180312 14555.684536 -1390.356182
49 suppressed 0.707899 -11768.069798 0.976074 0.312281 3.125624 -5124,.. 2.075452 13278.292793 -277.214157
28 suppressed 0.707751 -10773.722431 0.989636 0.326360 3.032342 -6965,.. 2.027353 15506.318872 -1542.864470
142 suppressed 0.707746 -11610.950337 0.966895 0.319337 3.027819 -5455,.. 2.148604 13429.938844 -1284.337936
21 suppressed 0.707740 -12036.418943 0.984355 0.334559 2.942251 -5711,.. 2.061603 13102.139528 -1443.360373
8 suppressed 0.707738 -13148.663195 0.978205 0.314652 3.108849 -3911,.. 2.085305 10897.973550 -263.869682
195 suppressed 0.707713 -9692.825219 0.978272 0.319962 3.057459 -7421,.. 2.110664 17109.744430 -1946.071426
149 suppressed 0.707620 -13161.544905 0.973257 0.338790 2.872742 -4232,.. 2.094864 10617.249614 -914.241602
87 suppressed 0.707618 -13553.509385 0.977256 0.332720 2.937175 -3862,.. 2.075937 10011.980106 -723.830110
104 suppressed 0.707603 -12612.377303 0.975036 0.318167 3.064541 -4193,.. 2.104714 11188.590870 135.225999
152 suppressed 0.707585 -11501.727201 0.973607 0.344324 2.827590 -5912,.. 2.155174 13559.385909 -1690.385163
141 suppressed 0.707575 -9973.717176 0.985053 0.340488 2.893062 -7111,.. 2.125851 16569.518635 -1929.464724
16 suppressed 0.707554 -13159.914853 0.978340 0.337995 2.894541 -4103,.. 2.122826 10419.538375 -734.708127
146 suppressed 0.707519 -11236.377197 0.979721 0.329021 2.977684 -6075,.. 2.101092 14279.141869 -1336.063082
65 suppressed 0.707508 -13867.033125 0.978684 0.336970 2.904360 -4009,.. 1.985818 9838.838040 -726.692712
151 suppressed 0.707465 -12529.011975 0.983077 0.327270 3.003867 -3756,.. 2.150554 11369.953430 -285.043877
13 suppressed 0.707361 -12294.144056 0.969659 0.332895 2.912805 -5368,.. 2.036012 12540.229977 -1344.007264
120 suppressed 0.707344 -14248.122927 0.974966 0.339749 2.869665 -3806,.. 1.933827 9220.165879 -611.947955
183 suppressed 0.707320 -13547.375868 0.991946 0.319046 3.109102 -4466,.. 1.959040 10609.975431 -742.146955
192 suppressed 0.707311 -14071.357127 0.987226 0.342600 2.881567 -4029,.. 1.942493 9673.672157 -787.815013
172 suppressed 0.707291 -14202.033901 0.985084 0.327455 3.008301 -3611,.. 1.929793 9373.320638 -211.111435
109 suppressed 0.707267 -13278.828687 0.990597 0.326223 3.036563 -4060,.. 2.121135 10309.577017 -651.578072
29 suppressed 0.707244 -11582.318621 0.974590 0.322279 3.024056 -5820,.. 2.091526 13566.264422 -1485.239749
70 suppressed 0.707232 -14426.213016 0.977949 0.331438 2.950627 -3647,.. 1.908878 8976.957737 -370.748044
24 suppressed 0.707171 -13452.424348 0.969906 0.325620 2.978647 -4469,.. 1.867069 11042.295215 -487.286521
31 suppressed 0.707094 -14150.017156 0.972750 0.344158 2.826460 -3919,.. 1.960341 9255.479365 -842.105187
188 suppressed 0.707090 -13118.532865 0.984173 0.320385 3.071847 -3875,.. 2.115472 10786.489404 -463.035278
62 suppressed 0.707049 -14663.897962 0.976094 0.335843 2.906403 -3559,.. 1.857300 8703.187417 -384.511327
34 suppressed 0.707022 -14141.075086 0.973570 0.341512 2.850763 -4104,.. 1.887372 9576.593530 -841.410279
161 suppressed 0.706965 -14284.886682 0.974979 0.332367 2.933444 -3876,.. 1.898711 9191.852405 -693.988669
78 suppressed 0.706961 -13842.987845 0.981583 0.330590 2.969190 -4185,.. 1.875746 10164.313831 -395.998395
41 suppressed 0.706938 -14000.601372 0.985296 0.347603 2.834546 -4135,.. 1.951028 9696.865847 -1041.866914
89 suppressed 0.706883 -14046.231784 0.970092 0.348089 2.786907 -3878,.. 1.977826 9292.257971 -985.584328
101 suppressed 0.706842 -14628.003671 0.974030 0.335238 2.905489 -3577,.. 1.819633 8855.402579 -269.816197
30 suppressed 0.706813 -14424.234893 0.970130 0.344662 2.814729 -3763,.. 1.887071 9008.289970 -664.068733
119 suppressed 0.706809 -14484.700741 0.978064 0.324069 3.018073 -3443,.. 1.896427 8883.805088 -386.689879
113 suppressed 0.706800 -10651.474175 1.001419 0.318557 3.143612 -6723,.. 1.865427 16122.804963 -891.883451
80 suppressed 0.706772 -14603.283640 0.977040 0.335609 2.911243 -3647,.. 1.819496 8865.562810 -331.501238
166 suppressed 0.706756 -13080.277537 0.967184 0.326186 2.965130 -3613,.. 1.940005 10915.406620 522.548226
158 suppressed 0.706714 -13754.512853 0.973007 0.312595 3.112678 -4105,.. 1.823113 10366.679558 -83.706624
150 suppressed 0.706676 -13905.816204 0.980039 0.323361 3.030794 -3340,.. 2.017035 9531.045819 -319.682474
17 suppressed 0.706649 -14516.754898 0.992478 0.330099 3.006606 -3786,.. 1.775090 9313.163959 -156.206709
50 suppressed 0.706649 -12385.739588 1.000124 0.328546 3.044093 -5813,.. 1.821217 13251.890204 -948.244839
155 suppressed 0.706577 -14871.704602 0.971259 0.342309 2.837377 -3427,.. 1.836385 8236.834081 -613.066781
39 suppressed 0.706560 -14834.369569 0.984353 0.346666 2.839485 -3633,.. 1.828666 8510.064049 -559.178396
57 suppressed 0.706538 -14560.121811 0.977939 0.350854 2.787310 -3520,.. 1.913195 8596.633855 -796.063170
10 suppressed 0.706332 -14348.438630 0.976600 0.354370 2.755879 -3950,.. 1.891869 9118.917543 -1099.532794
190 suppressed 0.706238 -15146.146701 0.979361 0.351310 2.787740 -3404,.. 1.779870 7940.328469 -578.273083
59 suppressed 0.706191 -14736.686310 0.973408 0.338667 2.874229 -3279,.. 1.843195 8652.452251 -247.935300
189 suppressed 0.706132 -14266.456449 0.992509 0.329895 3.008561 -3416,.. 1.956562 9061.110316 -611.349095
132 suppressed 0.706020 -15124.490816 0.991018 0.328418 3.017553 -3341,.. 1.679107 8422.090086 41.075251
170 suppressed 0.705971 -15037.345084 0.980479 0.351953 2.785820 -3263,.. 1.822034 7941.943415 -699.557822
88 suppressed 0.705764 -15043.458779 0.982262 0.328081 2.993960 -3285,.. 1.662796 8406.987892 263.222915
117 suppressed 0.705724 -15392.428503 0.985658 0.339930 2.899592 -3077,.. 1.660002 7906.449252 38.793113
197 suppressed 0.705694 -15355.968739 0.995124 0.337871 2.945280 -3235,.. 1.636952 8052.993739 131.759162
178 suppressed 0.705510 -11919.848967 0.974559 0.351235 2.774662 -6017,.. 2.081734 12799.283068 -2194.302833
176 suppressed 0.704807 -12282.556756 0.987461 0.349794 2.822975 -6343,.. 1.853654 13141.759251 -1982.224209
40 suppressed 0.704783 -12441.101308 0.985508 0.353803 2.785473 -5786,.. 1.915172 12555.907053 -2005.633858
86 suppressed 0.704288 -16117.681316 0.988564 0.359331 2.751127 -2946,.. 1.499881 6960.088832 105.578610
66 suppressed 0.703571 -15843.789595 0.986702 0.373186 2.643999 -2915,.. 1.486631 7441.454196 381.029731
174 suppressed 0.703511 -16671.185196 0.990813 0.383857 2.581205 -2717,.. 1.411135 6036.140476 -113.561488
25 suppressed 0.701207 -17758.332414 0.995580 0.427967 2.326299 -2219,.. 1.100871 4871.096497 223.296194
79 suppressed 0.701059 -5687.560070 0.977325 0.345522 2.828547 -9669,.. 2.695829 20156.895412 -3690.730365
179 suppressed 0.693248 -17615.624014 0.993124 0.455598 2.179827 -2091,.. 1.166737 4350.829720 -501.274535
186 suppressed 0.691515 -16146.337987 1.006219 0.286233 3.515384 952,.. 1.437103 -8030.463404 -4607.283486
168 suppressed 0.691327 -15643.989028 0.996489 0.253229 3.935136 1011,.. 1.550288 -8047.545094 -5145.121917
164 suppressed 0.691300 -13069.601688 0.999684 0.278029 3.595614 1013,.. 1.478663 -13762.710963 -7542.301777
156 suppressed 0.691226 -16913.401053 1.003214 0.338072 2.967457 997,.. 1.218829 -7210.382605 -4059.863095
199 suppressed 0.690879 -15436.036410 0.993807 0.299786 3.315060 1128,.. 1.381782 -10376.910416 -4737.296761
26 suppressed 0.690621 -13797.222377 0.995097 0.371134 2.681237 994,.. 1.259845 948.184353 15729.145744
181 suppressed 0.690596 -15920.557199 0.998980 0.403865 2.473546 1023,.. 1.132092 -512.402640 12898.333589
81 suppressed 0.690278 -17542.369990 1.007353 0.325826 3.091692 2059,.. 1.091616 -8972.122159 -915.329515
162 suppressed 0.690084 -12729.382776 0.991340 0.195089 5.081478 910,.. 1.845683 -10531.822198 -8408.354218
77 suppressed 0.689858 -15244.207265 0.997759 0.232009 4.300517 3003,.. 1.252288 -13232.632586 -2368.160018
93 suppressed 0.689778 -15784.842901 1.002332 0.192539 5.205860 3793,.. 1.347076 -12185.021913 -1035.336004
177 suppressed 0.689680 -14850.846865 0.993328 0.156849 6.333020 4782,.. 1.434956 -13733.972834 -1295.737101
198 suppressed 0.689632 -12986.871927 1.008315 0.126702 7.958144 6869,.. 1.571041 -16758.924100 -2027.061466
194 suppressed 0.689611 -15602.098567 1.003485 0.159223 6.302395 4810,.. 1.428945 -12295.416685 -628.754509
33 suppressed 0.689609 -12278.138207 0.993908 0.203980 4.872583 823,.. 1.806295 -11488.238423 -8730.740490
105 suppressed 0.689552 -13609.467077 0.992334 0.191692 5.176705 3309,.. 1.355005 -15902.533693 -3492.175824
58 suppressed 0.689512 -17358.138957 1.005609 0.418978 2.400146 870,.. 1.045546 -275.047407 9912.585816
110 suppressed 0.689484 -13592.997812 1.004792 0.135385 7.421760 5089,.. 1.514374 -15763.480894 -2383.269832
91 suppressed 0.689472 -15107.581727 1.004360 0.261308 3.843587 1780,.. 1.380881 -12033.016104 -3852.444167
42 suppressed 0.689452 -14880.210463 1.006580 0.145420 6.921896 5140,.. 1.462380 -13567.265276 -1411.840267
175 suppressed 0.689431 -15614.763683 1.002749 0.148155 6.768228 5129,.. 1.454054 -12278.057339 -350.524927
131 suppressed 0.689420 -14486.356792 1.004350 0.133203 7.540000 9128,.. 1.517692 -13829.211131 101.223296
37 suppressed 0.689415 -10870.138328 0.983454 0.108473 9.066369 7527,.. 1.617936 -20157.999072 -3678.411790
138 suppressed 0.689389 -15210.219158 1.001756 0.141186 7.095299 7125,.. 1.479178 -12716.695859 -8.047174
163 suppressed 0.689322 -17016.173404 1.001110 0.227116 4.407925 3805,.. 1.228709 -10099.970174 299.945004
23 suppressed 0.689285 -15334.121138 0.998403 0.160125 6.235128 6096,.. 1.426229 -12760.198282 -390.276762
139 suppressed 0.689275 -14968.738712 0.985607 0.161152 6.115998 5297,.. 1.462571 -13528.407177 -648.852833
136 suppressed 0.689245 -15624.305603 0.995569 0.163757 6.079542 3770,.. 1.443532 -11868.389636 -1316.610979
47 suppressed 0.689201 -13897.680234 1.000110 0.111033 9.007290 6088,.. 1.572170 -15093.067312 -1599.385039
84 suppressed 0.689147 -13428.710600 0.999244 0.164464 6.075770 4204,.. 1.417729 -16165.493664 -3291.329141
7 suppressed 0.689108 -16304.036773 1.016735 0.202429 5.022680 5265,.. 1.311394 -11564.508088 144.642129
15 suppressed 0.688938 -16839.026618 0.990299 0.210602 4.702221 3149,.. 1.294219 -9719.281488 -601.259062
134 suppressed 0.688918 -16293.358489 1.007546 0.194406 5.182676 6301,.. 1.329639 -11240.047609 653.065478
97 suppressed 0.688906 -17070.717136 0.994390 0.221341 4.492569 4240,.. 1.222828 -9842.720773 434.210854
72 suppressed 0.688881 -16756.222537 0.995600 0.252192 3.947792 3284,.. 1.182881 -11022.350227 -168.612044
143 suppressed 0.688853 -17184.192814 1.003904 0.349251 2.874449 724,.. 1.212848 -6783.152599 -3831.293475
9 suppressed 0.688788 -16929.817999 0.999057 0.228267 4.376702 4598,.. 1.233442 -10119.319360 451.592529
103 suppressed 0.688759 -15021.762842 1.032351 0.186352 5.539792 7979,.. 1.414699 -13797.869750 123.904654
95 suppressed 0.688724 -15716.280579 1.005432 0.190358 5.281787 7310,.. 1.375747 -12194.990607 541.750598
123 suppressed 0.688712 -16358.697414 0.999613 0.416962 2.397371 1103,.. 1.001356 -10978.773308 -3042.620950
133 suppressed 0.688627 -16714.913398 1.000219 0.261528 3.824512 3078,.. 1.170829 -11061.917755 -328.276312
83 suppressed 0.688541 -13193.813234 1.003698 0.094151 10.660543 7463,.. 1.722177 -14925.290440 -625.188691
140 suppressed 0.688419 -16201.718296 0.996574 0.202682 4.916926 7043,.. 1.312820 -11522.024944 928.752711
193 suppressed 0.688277 -12711.452094 1.036695 0.095536 10.851408 8309,.. 1.678374 -17111.195884 -1529.510671
4 suppressed 0.688199 -14171.902627 1.007360 0.188983 5.330423 4795,.. 1.387307 -15658.897913 -1668.410344
145 suppressed 0.687912 -16847.358892 1.002725 0.190211 5.271658 4759,.. 1.292409 -10438.952588 522.198433
191 suppressed 0.687045 -14053.780496 1.023599 0.120004 8.529711 3649,.. 1.613660 -13963.670293 -3391.008709
182 suppressed 0.687007 -14575.935787 0.990127 0.110974 8.922138 3799,.. 1.639604 -12521.087606 -2486.377752
118 suppressed 0.686961 -14280.962116 0.989060 0.159339 6.207275 5230,.. 1.522630 -14882.242897 -875.005921
0 suppressed 0.686316 -18402.724732 1.006706 0.411107 2.448769 468,.. 0.951009 -72.108522 7512.865021
165 suppressed 0.686287 -18092.935773 1.011781 0.536704 1.885176 552,.. 0.863286 -553.448714 9048.117914
38 suppressed 0.686258 -15721.053149 0.997985 0.424932 2.348576 59,.. 1.047510 1151.333170 -8026.376234
137 suppressed 0.685782 -16064.398904 0.994807 0.298940 3.327778 137,.. 1.353409 714.362544 -7789.337444
74 suppressed 0.685781 -13890.959743 1.001878 0.273593 3.661935 6,.. 1.409137 1486.368619 -10880.920070
99 suppressed 0.685694 -18151.931068 0.991348 0.438239 2.262116 165,.. 0.962420 143.099740 -4775.648947
19 suppressed 0.685242 -11562.276563 1.000928 0.085910 11.650946 3758,.. 1.756323 -17150.729304 -5979.677845
173 suppressed 0.685103 -14084.217988 0.997685 0.220747 4.519580 -78,.. 1.579495 776.186502 -10829.826932
111 suppressed 0.684769 -15319.082471 1.008228 0.172294 5.851782 5109,.. 1.495716 -13541.263052 319.247043
3 suppressed 0.684431 -15401.274495 1.007629 0.191158 5.271197 677,.. 1.498431 -3887.050902 11832.690672
121 suppressed 0.684407 -13937.650487 0.992099 0.206344 4.807990 729,.. 1.446902 -4714.716413 15926.541119
73 suppressed 0.684283 -12813.992369 0.985977 0.241703 4.079299 1026,.. 1.320159 -2980.046488 19435.732685
128 suppressed 0.684202 -15141.216197 1.003128 0.213985 4.687836 -241,.. 1.564046 780.584262 -9183.223335
106 suppressed 0.684201 -11086.710726 0.997053 0.189563 5.259753 -153,.. 1.716338 2233.209004 -14585.430936
154 suppressed 0.684183 -13200.388406 1.002949 0.102920 9.744952 697,.. 1.837202 3049.970181 13541.572505
45 suppressed 0.684162 -15173.224665 0.997116 0.160330 6.219140 904,.. 1.560863 -3971.550873 12201.658411
129 suppressed 0.684059 -15488.045282 1.009765 0.265010 3.810284 611,.. 1.314235 -4093.749215 13096.400435
148 suppressed 0.683925 -17228.786501 0.999101 0.261158 3.825651 478,.. 1.230638 -3374.787325 8589.258295
48 suppressed 0.683843 -12645.591557 0.995942 0.193605 5.144191 -76,.. 1.674068 1881.713401 -12311.944500
18 suppressed 0.683652 -14474.068822 0.996068 0.154580 6.443700 3415,.. 1.563079 -14856.213196 -711.253382
6 suppressed 0.683599 -14688.853454 0.996305 0.161329 6.175602 4630,.. 1.549996 -14574.284375 -186.800155
160 suppressed 0.682889 -17833.357924 1.004205 0.343214 2.925884 683,.. 1.044587 -3318.901153 8368.631468
12 suppressed 0.682880 -14569.677210 1.002315 0.146079 6.861466 293,.. 1.648877 -3529.818448 10832.740684
46 suppressed 0.682860 -14360.525184 0.992889 0.332293 2.987989 1016,.. 1.098568 -4456.108653 16979.962915
127 suppressed 0.682661 -15937.728140 1.000485 0.155626 6.428782 3187,.. 1.533165 -12237.046199 802.416662
51 suppressed 0.682285 -13358.780025 1.002176 0.088599 11.311318 3474,.. 1.810449 -16126.711937 -524.810599
53 suppressed 0.682245 -16159.402162 1.002702 0.272686 3.677137 -85,.. 1.328096 851.734209 -7557.616526
157 suppressed 0.682244 -16727.249768 0.997462 0.242244 4.117597 967,.. 1.239346 -3805.017817 10295.656583
147 suppressed 0.682243 -10522.042460 0.986185 0.012731 77.461739 9101,.. 2.018860 -16215.235576 -4802.045548
85 suppressed 0.681994 -13788.694471 0.987880 0.070377 14.036882 3900,.. 1.875761 -14780.287494 57.114541
52 suppressed 0.681892 -13230.769258 1.005784 0.067950 14.801845 3584,.. 1.869381 -16036.259210 -219.535820
61 suppressed 0.681854 -12949.802093 1.078937 0.366479 2.944063 1165,.. 1.333286 1101.037027 -11917.682664
196 suppressed 0.681745 -11542.925092 0.991728 0.032658 30.366759 5931,.. 2.082934 -17887.792289 75.640505
82 suppressed 0.681706 -16721.547422 1.008883 0.215422 4.683292 2154,.. 1.336659 -11274.731301 646.542526
125 suppressed 0.681639 -1996.077350 1.003199 0.014595 68.738019 16535,.. 2.232660 -32050.233630 -2989.085850
22 suppressed 0.681514 -17385.697656 0.999252 0.460879 2.168145 188,.. 0.989101 676.382955 -5099.198526
35 suppressed 0.681338 -11532.083488 1.009510 0.056269 17.940776 2213,.. 1.937487 -18378.049629 -350.928851
56 suppressed 0.681278 -11249.272314 0.999153 0.077809 12.841112 1833,.. 1.817969 -19489.256940 -1677.140678
159 suppressed 0.681234 -10414.610430 1.008112 0.069470 14.511495 1106,.. 1.855901 -20390.751257 -1226.931698
55 suppressed 0.681200 -15262.818769 0.992735 0.184504 5.380555 2006,.. 1.428374 -13638.777553 -173.180604
135 suppressed 0.680914 -17342.398900 1.000921 0.320015 3.127732 2799,.. 1.143231 -10647.593668 962.817510
2 suppressed 0.680594 -17428.994679 1.005258 0.534746 1.879881 115,.. 0.826383 287.799554 -5408.307500
116 suppressed 0.680548 -12977.217263 1.009804 0.178910 5.644190 1113,.. 1.403936 -4870.379817 18616.430871
54 suppressed 0.680547 -14423.721571 1.000461 0.101560 9.850965 1511,.. 1.701792 -14086.914141 144.300659
187 suppressed 0.680056 -17443.841353 0.995991 0.395636 2.517445 -1262,.. 1.141352 1475.293307 -4671.355430
112 suppressed 0.679939 -12477.176909 0.992692 0.210863 4.707758 993,.. 1.459250 -3096.803002 20036.128168
76 suppressed 0.679898 -10564.005567 0.978742 0.042275 23.151530 2989,.. 2.119125 -19279.233928 -2060.458376
5 suppressed 0.678769 -14572.544316 0.998531 0.093891 10.635028 1286,.. 1.752676 -13347.699425 -397.128794
100 suppressed 0.678695 -15289.721683 1.004637 0.165340 6.076186 480,.. 1.711596 455.529710 11016.567392
167 suppressed 0.678046 -19724.382139 1.015811 0.572649 1.773880 1528,.. 0.556397 -6569.206957 1656.726880
124 suppressed 0.677731 -12629.052363 1.004765 0.088322 11.376107 1338,.. 1.644186 -3908.012959 17552.049509
67 suppressed 0.677316 -14177.430824 1.007285 0.102900 9.788969 1343,.. 1.608008 -3194.641493 14607.671961
180 suppressed 0.676378 -5191.839972 0.999909 0.174022 5.745887 -4690,.. 2.882839 15518.112104 -7909.211882
126 suppressed 0.675290 -9392.032255 1.001299 0.221160 4.527493 270,.. 1.803445 1494.543317 -14173.445001
36 suppressed 0.674456 -4005.721485 1.024127 0.021999 46.552764 -501,.. 2.249018 -23030.789649 -10900.791625
90 suppressed 0.674130 -1477.444289 1.007652 0.148517 6.784752 161,.. 2.049748 3158.878570 -21142.347173
27 suppressed 0.672611 -10589.702972 0.995451 0.046428 21.440698 577,.. 2.114644 406.930309 17613.347175
20 suppressed 0.671560 -11984.804438 1.011126 0.038534 26.239610 225,.. 2.133795 1904.593995 14842.245139
71 suppressed 0.671365 -10616.850003 1.002407 0.031134 32.196618 689,.. 2.153088 1409.231304 16489.111142
In [15]:
# visualisation of the motif with the highest r
print('I: Best motif according to r from the repeated optimizations.')
print('I: PCA components: %i, %i' %(df_repetitions.iloc[0]['PCA1'], df_repetitions.iloc[0]['PCA2']))
model_best_repetition=df_repetitions.iloc[0]['model']
model_best_repetition.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE) 
# store results and display stages
STAGES.append('best repetition', model_best_repetition)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
I: Best motif according to r from the repeated optimizations.
I: PCA components: 17988, -39
I: energy matrix and logos:

       A      C     G     T
0 -6305  12275 -7599  1630
1  8790  -5524  4206 -7472
2   317  -2245  2231  -303
3  -829    993   978 -1143

I: summed absolute energies of each position:
0    27811
1    25994
2     5097
3     3945
dtype: int64

I: averaged summed energy over all positions: 15712
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8569 +/- 9131
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.30577 .. 0.96111 (ratio: 3.1)
I: number of probes: 3149
I: Pearson Correlation  r: 0.7091
I: mean absolute error: 0.1027
I: Classification performance AUROC: 0.8296
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.709102 0.829612 -8369.682846 True 3.143276 0.961111 0.305767 -6305,.. suppressed
In [16]:
# refit model to minimize mae
last_model=STAGES.df.at[max(STAGES.df.index),'model']
refitted_model_best_repetition=model_best_repetition.refit_mae(X,y)

# print & display main results
refitted_model_best_repetition.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

# store results and display stages
STAGES.append('mae optimzed', refitted_model_best_repetition)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
Optimization terminated successfully.
         Current function value: 0.098388
         Iterations: 8
         Function evaluations: 1634
I: energy matrix and logos:

       A      C     G     T
0 -5884  12970 -7518   431
1  8824  -5437  4115 -7502
2   105  -2208  2341  -238
3  -482    976   745 -1238

I: summed absolute energies of each position:
0    26804
1    25880
2     4894
3     3442
dtype: int64

I: averaged summed energy over all positions: 15255
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8575 +/- 9400
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.31222 .. 0.95738 (ratio: 3.1)
I: number of probes: 3149
I: Pearson Correlation  r: 0.6958
I: mean absolute error: 0.0984
I: Classification performance AUROC: 0.8205
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.709102 0.829612 -8369.682846 True 3.143276 0.961111 0.305767 -6305,.. suppressed
1 mae optimzed T7 primase 3149 4 0.695813 0.820490 -8369.682846 True 3.143276 0.957377 0.312217 -5884,.. suppressed
In [17]:
### Based on the motif of CORE_MOTIF_LENGTH analyze the neigbouring positions 
### whether their inclusion can improve the quality of the motif
df_positions=model_best_repetition.investigate_extension_parallel(X, y, end5=3, end3=3, nuc_type=NUC_TYPE)

list_positions=df_positions.index[df_positions['+2%']].tolist()+[0] # list of positions with an increase of2% and default position 0
ext5=-min(list_positions)
ext3=max(list_positions)
print("I: It is suggested to extend the core motif at the 5' end by %i and at the 3' end by %i positions." %(ext5, ext3))
  0%|          | 0/6 [00:00<?, ?engine/s]
job5:   0%|          | 0/3 [00:00<?, ?tasks/s]
job3:   0%|          | 0/3 [00:00<?, ?tasks/s]
I: Optimization took 0.02 hours.
I: It is suggested to extend the core motif at the 5' end by 0 and at the 3' end by 0 positions.
In [18]:
### fit & predict optimization starting with extended energy matrix if extension appears to improve prediction

if ext5+ext3!=0: #extension suggestion from previous analysis of the bordering positions
    expanded_energies=model_best_repetition.energies_
    # append energies of single-optimized bordering positions to energies of central part
    if ext5!=0:
        energies_5=np.concatenate(df_positions['energies'][(df_positions.index<0) & (df_positions.index>=-ext5)].to_numpy())
        expanded_energies=np.concatenate((energies_5, expanded_energies))
    if ext3!=0:
        energies_3=np.concatenate(df_positions['energies'][(df_positions.index<=ext3) & (df_positions.index>0)].to_numpy().flatten())
        expanded_energies=np.concatenate((expanded_energies,  energies_3))

    mf.energies2logo(expanded_energies, nuc_type=NUC_TYPE)
    print('I: Optimization started with following extended motif.')
    expanded_motif_length=len(expanded_energies)//4
    
    
    model_extended=mf.findmotif(motif_length=expanded_motif_length, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
                   start=expanded_energies, fit_G0=True)

    start = time()
    model_extended.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    model_extended.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

    # store results and display stages
    STAGES.append('extended', model_extended)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
    print('I: Motif is not extended based on previous analysis.')
I: Motif is not extended based on previous analysis.
In [19]:
### fit & predict optimization starting with extended energy matrix plus one bordering position on each side if current bordering position exceed the information of 0.25

last_model=STAGES.df.at[max(STAGES.df.index),'model']
I_5=mf.energies2information(last_model.energies_[0:4])>=0.25 #sufficient information content of 5' end position
I_3=mf.energies2information(last_model.energies_[-4:])>=0.25 #sufficient information content of 3' end position

if I_5 or I_3:
    print('I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.')
    expanded_energies_with_border=mf.modify_energies(last_model.energies_, end5=I_5, end3=I_3)  
    mf.energies2logo(expanded_energies_with_border, nuc_type=NUC_TYPE)
    motif_length_with_border=len(expanded_energies_with_border)//4

    model_with_border=mf.findmotif(motif_length=motif_length_with_border, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
                   start=expanded_energies_with_border, fit_G0=True)


    start = time()
    model_with_border.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    model_with_border.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

    # store results and display stages
    STAGES.append('expanded, border', model_with_border)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
    print('I: Both bordering positions of the found motif have an information content below 0.25. No futher optimization required.')
I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.
Optimization took 0.25 hours.
I: energy matrix and logos:

       A      C     G     T
0   467   -139  -407    79
1 -6500  12755 -7629  1374
2  8439  -5499  4236 -7176
3   261  -2117  2001  -145
4  -940    976  1008 -1044

I: summed absolute energies of each position:
0     1094
1    28260
2    25351
3     4526
4     3970
dtype: int64

I: averaged summed energy over all positions: 12640
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8898 +/- 9231
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.25453 .. 0.98793 (ratio: 3.9)
I: number of probes: 3149
I: Pearson Correlation  r: 0.7171
I: mean absolute error: 0.1013
I: Classification performance AUROC: 0.8313
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.709102 0.829612 -8369.682846 True 3.143276 0.961111 0.305767 -6305,.. suppressed
1 mae optimzed T7 primase 3149 4 0.695813 0.820490 -8369.682846 True 3.143276 0.957377 0.312217 -5884,.. suppressed
2 expanded, border T7 primase 3149 5 0.717139 0.831264 -8730.333206 True 3.881376 0.987928 0.254530 467,.. suppressed
In [20]:
last_model=STAGES.df.at[max(STAGES.df.index),'model']
df_relevant_positions=last_model.explore_positions(X, y)
list_positions=df_relevant_positions.index[df_relevant_positions['-2%']].tolist() # list of positions with an increase of2% and default position 0
start_relevant=min(list_positions)
end_relevant=max(list_positions)
red5=-start_relevant
red3=end_relevant-len(df_relevant_positions)+1
print('I: The analysis suggests, that positions between %i to %i contribute significantly to the motif' %(start_relevant, end_relevant))
last_model=STAGES.df.at[max(STAGES.df.index),'model']

if (end_relevant-start_relevant+1)in STAGES.df['motif length'].to_list():
    print('I: No need for a further optimization. An optimization with motif length of %i has already been done.' %(end_relevant-start_relevant+1))
    print('I: Checking whether G0 has been chosen correctly.')
    last_model.investigate_G0(X, y)
else:
    print('I: Bordering positions only marginally contributing towards regression quality are dropped.')
    print('I: New start energy for motif optimization:')
    start_final_model=mf.modify_energies(last_model.energies_, end5=red5, end3=red3)
    mf.energies2logo(start_final_model, nuc_type=NUC_TYPE)
    final_model=mf.findmotif(motif_length=len(start_final_model)//4, protein_conc=PROT_CONC, 
                             both_strands=BOTH_STRANDS, start=start_final_model)

    start = time()
    final_model.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    final_model.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)
    
    print('I: Checking whether G0 has been chosen correctly.')
    final_model.investigate_G0(X, y)

    # store results and display stages
    STAGES.append('shrinked', final_model)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)  
I: The analysis suggests, that positions between 1 to 4 contribute significantly to the motif
I: No need for a further optimization. An optimization with motif length of 4 has already been done.
I: Checking whether G0 has been chosen correctly.
I: Current G0 = -8730 J/mol (see red broken line in figure below) with r = 0.717.
I: Maximal r is 0.718 at G0=-5730 J/mol (see green broken line below).
I: Maximal occupancy of 2 is reached at G0=-10730 J/mol (see blue broken line below).
I: Maximal occupancy of 0.2 is reached at G0=-4730 J/mol (see blue broken line below).
I: G0 is in a range leading to maximal probe occupancy between 0.2 and 2. Good.
I: Maximal r is close to r achieved with current G0. Good.
In [21]:
# refit model to minimize mae
last_model=STAGES.df.at[max(STAGES.df.index),'model']
refitted_model=last_model.refit_mae(X,y)

# print & display main results
refitted_model.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

# store results and display stages
STAGES.append('final mae optimzed', refitted_model)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
Optimization terminated successfully.
         Current function value: 0.096996
         Iterations: 14
         Function evaluations: 3483
I: energy matrix and logos:

       A      C     G     T
0   354    -41  -284   -28
1 -6129  13736 -7781   175
2  8557  -5309  4058 -7305
3  -484  -2105  2435   154
4  -479   1055   573 -1149

I: summed absolute energies of each position:
0      709
1    27823
2    25231
3     5178
4     3258
dtype: int64

I: averaged summed energy over all positions: 12440
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8888 +/- 9642
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.31321 .. 1.06356 (ratio: 3.9)
I: number of probes: 3149
I: Pearson Correlation  r: 0.6981
I: mean absolute error: 0.0970
I: Classification performance AUROC: 0.8165
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.709102 0.829612 -8369.682846 True 3.143276 0.961111 0.305767 -6305,.. suppressed
1 mae optimzed T7 primase 3149 4 0.695813 0.820490 -8369.682846 True 3.143276 0.957377 0.312217 -5884,.. suppressed
2 expanded, border T7 primase 3149 5 0.717139 0.831264 -8730.333206 True 3.881376 0.987928 0.254530 467,.. suppressed
3 final mae optimzed T7 primase 3149 5 0.698074 0.816455 -8730.333206 True 3.881376 1.063557 0.313207 354,.. suppressed
In [22]:
STAGES.df['energies'].apply(mf.energies2information)
Out[22]:
0    2.372494
1    2.338378
2    2.306898
3    2.309537
Name: energies, dtype: float64
In [24]:
STAGES.df.to_json('%s_%s-%s-%s_%s-%s.json' %(PROTEIN_NAME, datetime.now().year, datetime.now().month,datetime.now().day , datetime.now().hour, datetime.now().minute))
In [ ]: